options(repos = "http://cran.us.r-project.org") 
#import and load the required packages
if (!require(ggpubr)) {
  install.packages("ggpubr")
}
## Loading required package: ggpubr
## Loading required package: ggplot2
library(ggpubr)
if (!require(tidyverse)) {
  install.packages("tidyverse")
}
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.0     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.1.8
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(tidyverse)

if (!require(ggplot2)) {
  install.packages("ggplot2")
}
library(ggplot2)
if (!require(plotly)) {
  install.packages("plotly")
}
## Loading required package: plotly
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(plotly)


if (!require(moments)) {
install.packages("moments")
}
## Loading required package: moments
library(moments)
if (!require(LambertW)) {
  install.packages("LambertW")
}
## Loading required package: LambertW
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:plotly':
## 
##     select
## 
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## This is 'LambertW' version 0.6.7.1. See the NEWS file and citation("LambertW").
## 
## 
## Attaching package: 'LambertW'
## 
## The following objects are masked from 'package:moments':
## 
##     kurtosis, skewness
if (!require("DT")) install.packages('DT')
## Loading required package: DT
library(DT)
if (!require("LambertW")) install.packages("LambertW")
library(LambertW)

1. Reading the gapminder into a tibble

gapminder <- read_csv("gapminder_clean.csv")
## New names:
## Rows: 2607 Columns: 20
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (2): Country Name, continent dbl (18): ...1, Year, Agriculture, value added (%
## of GDP), CO2 emissions (me...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
gapminder
## # A tibble: 2,607 × 20
##     ...1 Country…¹  Year Agric…² CO2 e…³ Domes…⁴ Elect…⁵ Energ…⁶ Expor…⁷ Ferti…⁸
##    <dbl> <chr>     <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1     0 Afghanis…  1962    NA    0.0738  21.3        NA      NA    4.88    7.45
##  2     1 Afghanis…  1967    NA    0.124    9.92       NA      NA    6.77    7.45
##  3     2 Afghanis…  1972    NA    0.131   18.9        NA      NA   14.8     7.45
##  4     3 Afghanis…  1977    NA    0.183   13.8        NA      NA   11.7     7.45
##  5     4 Afghanis…  1982    NA    0.166   NA          NA      NA   NA       7.45
##  6     5 Afghanis…  1987    NA    0.276   NA          NA      NA   NA       7.46
##  7     6 Afghanis…  1992    NA    0.101   NA          NA      NA   NA       7.50
##  8     7 Afghanis…  1997    NA    0.0608  NA          NA      NA   NA       7.64
##  9     8 Afghanis…  2002    38.5  0.0411  NA          NA      NA   32.4     7.27
## 10     9 Afghanis…  2007    30.6  0.0879   0.535      NA      NA   17.8     6.44
## # … with 2,597 more rows, 10 more variables: `GDP growth (annual %)` <dbl>,
## #   `Imports of goods and services (% of GDP)` <dbl>,
## #   `Industry, value added (% of GDP)` <dbl>,
## #   `Inflation, GDP deflator (annual %)` <dbl>,
## #   `Life expectancy at birth, total (years)` <dbl>,
## #   `Population density (people per sq. km of land area)` <dbl>,
## #   `Services, etc., value added (% of GDP)` <dbl>, pop <dbl>, …

2. Filtering and plotting the data

gapminder_filtered <- filter(gapminder, Year == 1962)
gapminder_filtered
## # A tibble: 259 × 20
##     ...1 Country…¹  Year Agric…² CO2 e…³ Domes…⁴ Elect…⁵ Energ…⁶ Expor…⁷ Ferti…⁸
##    <dbl> <chr>     <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1     0 Afghanis…  1962      NA  0.0738    21.3      NA      NA    4.88    7.45
##  2    10 Albania    1962      NA  1.44      NA        NA      NA   NA       6.28
##  3    20 Algeria    1962      NA  0.485     NA        NA      NA   19.8     7.61
##  4    30 American…  1962      NA NA         NA        NA      NA   NA      NA   
##  5    40 Andorra    1962      NA NA         NA        NA      NA   NA      NA   
##  6    50 Angola     1962      NA  0.216     NA        NA      NA   NA       7.40
##  7    60 Antigua …  1962      NA  1.82      NA        NA      NA   NA       4.34
##  8    70 Arab Wor…  1962      NA  0.761     18.2      NA      NA   NA       6.96
##  9    80 Argentina  1962      NA  2.52      17.3      NA      NA    4.69    3.09
## 10    90 Armenia    1962      NA NA         NA        NA      NA   NA       4.43
## # … with 249 more rows, 10 more variables: `GDP growth (annual %)` <dbl>,
## #   `Imports of goods and services (% of GDP)` <dbl>,
## #   `Industry, value added (% of GDP)` <dbl>,
## #   `Inflation, GDP deflator (annual %)` <dbl>,
## #   `Life expectancy at birth, total (years)` <dbl>,
## #   `Population density (people per sq. km of land area)` <dbl>,
## #   `Services, etc., value added (% of GDP)` <dbl>, pop <dbl>, …
gapminder_plot <- ggplot(gapminder_filtered, aes(x = gdpPercap,
  y = `CO2 emissions (metric tons per capita)`)) +
  geom_point()
  
  ggsave("gapminderplot.png")
## Saving 7 x 5 in image
## Warning: Removed 151 rows containing missing values (`geom_point()`).
gapminder_plot
## Warning: Removed 151 rows containing missing values (`geom_point()`).

3. Finding correlation

correlation <- cor(gapminder_filtered$`CO2 emissions (metric tons per capita)`,
  gapminder_filtered$gdpPercap, use = "complete.obs")
p_value <- cor.test(gapminder_filtered$`CO2 emissions (metric tons per capita)`,
  gapminder_filtered$gdpPercap)$p.value
correlation
## [1] 0.9260817
p_value
## [1] 1.128679e-46

4 On the unfiltered data, answer “In what year is the correlation between ‘CO2 emissions (metric tons per capita)’ and gdpPercap the strongest?” Filter the dataset to that year for the next step…

gapminder_filtered <- gapminder %>%
  filter(!is.na(`CO2 emissions (metric tons per capita)`), !is.na(gdpPercap))

cor_by_year <- gapminder_filtered %>%
  group_by(Year) %>%
  summarise(correlation = cor(`CO2 emissions (metric tons per capita)`, gdpPercap, use = "complete.obs"))
  
cor_by_year
## # A tibble: 10 × 2
##     Year correlation
##    <dbl>       <dbl>
##  1  1962       0.926
##  2  1967       0.939
##  3  1972       0.843
##  4  1977       0.793
##  5  1982       0.817
##  6  1987       0.810
##  7  1992       0.809
##  8  1997       0.808
##  9  2002       0.801
## 10  2007       0.720
strongest_year_row <- cor_by_year %>%
  filter(correlation == max(correlation)) %>%
  slice(1)

strongest_year <- strongest_year_row
strongest_year
## # A tibble: 1 × 2
##    Year correlation
##   <dbl>       <dbl>
## 1  1967       0.939

5 Using plotly, create an interactive scatter plot comparing ‘CO2 emissions (metric tons per capita)’ and gdpPercap, where the point size is determined by pop (population) and the color is determined by the continent. You can easily convert any ggplot plot to a plotly plot using the ggplotly() command.

gapminder_strongest_year <- gapminder %>%
  filter(Year == 1967)

gapminder_strongest_year
## # A tibble: 259 × 20
##     ...1 Country…¹  Year Agric…² CO2 e…³ Domes…⁴ Elect…⁵ Energ…⁶ Expor…⁷ Ferti…⁸
##    <dbl> <chr>     <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1     1 Afghanis…  1967   NA      0.124    9.92      NA      NA    6.77    7.45
##  2    11 Albania    1967   NA      1.36    NA         NA      NA   NA       5.39
##  3    21 Algeria    1967   10.3    0.632   28.0       NA      NA   23.4     7.67
##  4    31 American…  1967   NA     NA       NA         NA      NA   NA      NA   
##  5    41 Andorra    1967   NA     NA       NA         NA      NA   NA      NA   
##  6    51 Angola     1967   NA      0.167   NA         NA      NA   NA       7.40
##  7    61 Antigua …  1967   NA      9.11    NA         NA      NA   NA       4.04
##  8    71 Arab Wor…  1967   NA      1.33    31.3       NA      NA   NA       6.93
##  9    81 Argentina  1967    9.98   2.86    18.7       NA      NA    7.50    3.05
## 10    91 Armenia    1967   NA     NA       NA         NA      NA   NA       3.61
## # … with 249 more rows, 10 more variables: `GDP growth (annual %)` <dbl>,
## #   `Imports of goods and services (% of GDP)` <dbl>,
## #   `Industry, value added (% of GDP)` <dbl>,
## #   `Inflation, GDP deflator (annual %)` <dbl>,
## #   `Life expectancy at birth, total (years)` <dbl>,
## #   `Population density (people per sq. km of land area)` <dbl>,
## #   `Services, etc., value added (% of GDP)` <dbl>, pop <dbl>, …
strongest_year_plot <- ggplot(gapminder_strongest_year, 
  aes(x = gdpPercap, y = `CO2 emissions (metric tons per capita)`, 
  color = continent,
  size = pop)) +
  geom_point()
ggplotly(strongest_year_plot)

1

What is the relationship between continent and ‘Energy use (kg of oil equivalent per capita)’? (stats test needed)

We want to test if there is a statistically significant difference in the Energy use per continent. Thus, continent is the predictor variable, and energy use is the outcome variable.

To use a parametric test, we must ensure that three assumptions are met: Normality, equal variances, and independence.

Normality assumption: To check for normality, we use a qqplot.

# ggqqplot(data = gapminder, x = `Energy use (kg of oil equivalent per capita)`, facet.by = 'continent')

ggplot(gapminder, aes(x = `Energy use (kg of oil equivalent per capita)`)) +
  geom_histogram(bins = 30) +
  facet_wrap(~ continent, scales = "free") +
  xlab("Energy use (kg of oil equivalent per capita)") +
  ylab("Frequency")
## Warning: Removed 1197 rows containing non-finite values (`stat_bin()`).

As we can see, the data is not normally distributed. Therefore, we use a Kruskal-Wallis test

kruskal.test(gapminder$`Energy use (kg of oil equivalent per capita)`, gapminder$continent, na.action = "na.omit")
## 
##  Kruskal-Wallis rank sum test
## 
## data:  gapminder$`Energy use (kg of oil equivalent per capita)` and gapminder$continent
## Kruskal-Wallis chi-squared = 318.68, df = 4, p-value < 2.2e-16

As we can see, the p value is less than 2.2e-16, which is less than 0.05, which means that the energy use varies significantly between at least two continents.

2

Is there a significant difference between Europe and Asia with respect to ‘Imports of goods and services (% of GDP)’ in the years after 1990? (stats test needed) ### Filtering the data

gapminder_years <- gapminder %>%
  filter(Year > 1990) %>%
  filter(continent == "Asia" | continent == "Europe")

Visualizations

# Create box plots

box_plot <- ggplot(gapminder_years, aes(x = continent, y = `Imports of goods and services (% of GDP)`, fill = continent)) +
  geom_boxplot() +
  labs(x = "Continent", y = "Imports of goods and services (% of GDP)", fill = "Continent") +
  ggtitle("Box Plots of GDP Imports by Continent")

ggplotly(box_plot)
## Warning: Removed 12 rows containing non-finite values (`stat_boxplot()`).
# Create density plots
density_plot <- ggplot(gapminder_years, aes(x = `Imports of goods and services (% of GDP)`, fill = continent)) +
  geom_density(alpha = 0.5) +
  labs(x = "Imports of goods and services (% of GDP)", fill = "Continent") +
  ggtitle("Density Plots of GDP Imports by Continent")

ggplotly(density_plot)
## Warning: Removed 12 rows containing non-finite values (`stat_density()`).

Visually, the two continent’s import of goods and services are very close with overlapping peaks, although the variances appear to be different and there appears to be 4 outliers in Asia.

Plotting a qqplot

# Filter data for the years after 1990
data <- gapminder_years

# Plot Q-Q plot with facet by continent
ggplotly(ggqqplot(data = gapminder_years, x = "`Imports of goods and services (% of GDP)`", facet.by = "continent"))
## Warning: Removed 12 rows containing non-finite values (`stat_qq()`).
## Warning: Removed 12 rows containing non-finite values (`stat_qq_line()`).
## Removed 12 rows containing non-finite values (`stat_qq_line()`).

For Asia, there are a few points with a high GDP above the diagonal line. As normality has been violated, it would not be appropriate to use a parametric test, so we use the non-parametric Mann-Whitney-Wilcoxon Test.

Perform Mann-Whitney-Wilcoxon Test

result <- wilcox.test(`Imports of goods and services (% of GDP)` ~ continent, data = gapminder_years)
print(result)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  Imports of goods and services (% of GDP) by continent
## W = 5707, p-value = 0.7867
## alternative hypothesis: true location shift is not equal to 0

As the p-value is greater than 0.05, we did not find a significant difference in ‘Imports of goods and services (% of GDP)’ between Europe and Asia

3 What is the country (or countries) that has the highest 'Population density (people per sq. km of land area)' across all years? (i.e., which country has the highest average ranking in this category across each time point in the dataset?)

gapminder_pd <- gapminder %>%
  group_by(`Country Name`) %>%
  summarize(`Mean population density` = mean(`Population density (people per sq. km of land area)`, , na.rm = TRUE))  %>%
  arrange(desc(`Mean population density`)) %>%
  slice_head(n = 5)

datatable(gapminder_pd)
mean_pd_plot <- gapminder_pd %>%
  ggplot(aes(x = `Country Name`, y = `Mean population density`)) +
  ggtitle('Mean population density of the top 5 countries') +
  geom_bar(stat = 'Identity')
ggplotly(mean_pd_plot)
gapminder_dense <- gapminder_pd %>% slice(1)

colnames(gapminder_dense)[1] <- "Country with the most population density"

datatable(gapminder_dense)

As seen from bar chart, Macao SAR, China bas the greatest population density across the years, with a mean population density of 14732.04 people per square km.

4 What country (or countries) has shown the greatest increase in 'Life expectancy at birth, total (years)' between 1962 and 2007?

# Get the top 5 countries with the greatest increase in life expectancies
gapminder_difference <-gapminder %>%
  filter(Year %in% c(1962, 2007)) %>%
  group_by(`Country Name`)%>%
  arrange((`Life expectancy at birth, total (years)`)) %>%
  reframe(`Difference in Life expectancy (2007 - 1962)` = diff(`Life expectancy at birth, total (years)`)) %>%
  arrange(desc(`Difference in Life expectancy (2007 - 1962)`)) %>%
  slice(1:5)

gapminder_difference_plot <- gapminder_difference %>%
  ggplot(aes(x = `Country Name`,  y = `Difference in Life expectancy (2007 - 1962)`)) +
  geom_bar(stat = "identity") +
  labs(title = "Top 5 countries with the greatest increase in life expectancy from 1962 to 2007", y = 'Difference in Life expectancy in years for (2007 - 1962)')

ggplotly(gapminder_difference_plot)

As seen from the above bar chart, the country whose life expectancy increased the most from 1962 - 2007 for Maldives